library(nycflights13)
library(mosaicData)
library(janitor)
library(tidyr)
library(broom)
library(mdsr)
library(RColorBrewer)
library(osmdata)
library(ggmap)
library(zipcodeR)
library(tidyverse)
library(paletteer)
library(gt)

(a) (Excerpted from Problem 4.9 in textbook) Use the flights table. What month had the highest proportion of cancelled flights (flights with missing departure/arrival delay time)? What month had the lowest? Interpret any seasonal patterns.
Here I defined a cancelled flight as a flight that never left, logged no air time, and never arrived at its destination.
# A cancelled flight never departed, never arrived, and logged no air time,
# so all three timestamps are missing (NA).
# Note: use bare column names inside filter(); `flights$dep_time` would
# bypass dplyr's data masking and break if the pipeline gained earlier steps.
cancelled_flights <-
  flights %>%
  filter(is.na(dep_time) & is.na(arr_time) & is.na(air_time)) %>%
  select(month, day, arr_time, air_time, dep_time)

# Count cancelled flights per month
number_flights_cancelled <-
  cancelled_flights %>%
  group_by(month) %>%
  summarise(n_flights_cancelled = n())
# Total number of flights per (year, month).
# .groups = "drop" silences the "summarise has grouped output" message and
# returns an ungrouped tibble; downstream joins behave identically.
total_number_flights <-
  flights %>%
  group_by(year, month) %>%
  summarise(n_flights_total = n(), .groups = "drop")
# Join monthly cancellations to monthly totals (single year of data, so
# month alone is a valid key).
proportion_flights_cancelled <-
  inner_join(number_flights_cancelled, total_number_flights, by = "month")

# Compute and display the cancellation proportion per month, highest first.
# Round only the proportion column — round() on the whole data frame is
# fragile (it errors if any column is non-numeric).
proportion_flights_cancelled %>%
  mutate(proportion_flight_cancelled =
           round(n_flights_cancelled / n_flights_total, 2)) %>%
  arrange(desc(proportion_flight_cancelled)) %>%
  rename(Month = "month",
         flights_cancelled = "n_flights_cancelled",
         Total_flights = "n_flights_total",
         Proportion_flights_cancelled = "proportion_flight_cancelled") %>%
  select(year, everything()) %>%
  gt() %>%
  fmt_percent(Proportion_flights_cancelled, decimals = 0)

| year | Month | flights_cancelled | Total_flights | Proportion_flights_cancelled |
|---|---|---|---|---|
| 2013 | 2 | 1261 | 24951 | 5% |
| 2013 | 6 | 1009 | 28243 | 4% |
| 2013 | 12 | 1025 | 28135 | 4% |
| 2013 | 3 | 861 | 28834 | 3% |
| 2013 | 7 | 940 | 29425 | 3% |
| 2013 | 1 | 521 | 27004 | 2% |
| 2013 | 4 | 668 | 28330 | 2% |
| 2013 | 5 | 563 | 28796 | 2% |
| 2013 | 8 | 486 | 29327 | 2% |
| 2013 | 9 | 452 | 27574 | 2% |
| 2013 | 10 | 236 | 28889 | 1% |
| 2013 | 11 | 233 | 27268 | 1% |
(b) Challenge: Use the weather table. On how many days was there precipitation in the New York area for each month in 2013? What do you observe in combination with the results from part (a)? (Hint: the distinct function in dplyr can be useful.)
# Collapse hourly weather observations to daily means, then count the days
# per month with any recorded precipitation.
weather %>%
  group_by(year, month, day) %>%
  mutate(origin = as.factor(origin),
         year = as.factor(year),
         month = as.factor(month),
         day = as.factor(day),
         hour = as.factor(hour)) %>%
  # Lambda instead of `mean, na.rm = T`: passing extra args through the
  # `...` of across() is deprecated, and T should be spelled TRUE.
  summarize(across(where(is.numeric), ~ mean(.x, na.rm = TRUE))) %>%
  filter(precip > 0) %>%
  group_by(month) %>%
  summarise(days_with_rain = n()) %>%
  arrange(desc(days_with_rain)) %>%
  rename(Month = "month", Days_with_rain = "days_with_rain") %>%
  gt()

| Month | Days_with_rain |
|---|---|
| 6 | 16 |
| 1 | 14 |
| 2 | 14 |
| 7 | 13 |
| 12 | 13 |
| 4 | 12 |
| 5 | 12 |
| 8 | 11 |
| 11 | 11 |
| 10 | 9 |
| 3 | 8 |
| 9 | 8 |
(Excerpted from Problem 5.4 in textbook) Use the flights and planes tables. What is the oldest plane (specified by the tailnum variable) that flew from New York City airports in 2013?
# Oldest planes flying from NYC in 2013: drop the flight-level `year` column
# first so the planes table's manufacturing year survives the join, then
# average air time per plane and list the five oldest.
flights %>%
  select(-year) %>%
  inner_join(planes, by = "tailnum") %>%
  select(tailnum, year, everything()) %>%
  group_by(tailnum, year) %>%
  summarise(Mean_airtime = mean(air_time)) %>%
  arrange(year) %>%
  drop_na() %>%
  head(5)

# A tibble: 5 × 3
# Groups: tailnum [5]
tailnum year Mean_airtime
<chr> <int> <dbl>
1 N381AA 1956 280.
2 N14629 1965 208.
3 N615AA 1967 180.
4 N364AA 1973 306.
5 N675MC 1975 95.8
(a) (Excerpted from Problem 6.6 in textbook) The HELPfull data contains information about the Health Evaluation and Linkage to Primary Care (HELP) randomized trial in tall format. Create a table in which each row displays the DRUGRISK and SEXRISK scores at baseline and at 6 months for a subject ID. (Hint: See the textbook for breakdown steps.)
# Load the HELP trial data and reshape subject 1's baseline (time 0) and
# 6-month DRUGRISK / SEXRISK scores into long format.
data("HELPfull")

HELPfull %>%
  clean_names() %>%
  filter(id == 1, time %in% c(0, 6)) %>%
  pivot_longer(cols = c(drugrisk, sexrisk),
               names_to = "type_of_risk",
               values_to = "risk_score") %>%
  select(id, type_of_risk, risk_score, time, everything()) %>%
  arrange(type_of_risk) %>%
  select(1:9) %>%
  gt()

| id | type_of_risk | risk_score | time | num_intervals | int_time1 | days_since_bl | int_time2 | days_since_prev |
|---|---|---|---|---|---|---|---|---|
| 1 | drugrisk | 0 | 0 | 1 | 0.000000 | NA | 6.000000 | NA |
| 1 | drugrisk | 0 | 6 | 1 | 8.033333 | 241 | 8.033333 | 241 |
| 1 | sexrisk | 4 | 0 | 1 | 0.000000 | NA | 6.000000 | NA |
| 1 | sexrisk | 1 | 6 | 1 | 8.033333 | 241 | 8.033333 | 241 |
(b) (Excerpted from Problem 7.3 in textbook) Use the purrr::map() function and the HELPrct data frame to fit a regression model predicting cesd as a function of age separately for each of the levels of the substance variable. Generate a list of results (estimates and confidence intervals) for the slope parameter for each level of the grouping variable. (Hint: The tidy() function with the option conf.int = T computes confidence intervals for a lm() object.)
# Per-substance regressions of CES-D on age: nest by secondary substance,
# fit lm() within each group, and tidy out the age slope with 95% CIs.
# (Renamed from `table` to avoid masking base::table.)
slope_table <-
  HELPfull %>%
  clean_names() %>%
  select(ces_d, age, secd_sub) %>%
  na.omit() %>%
  mutate(secd_sub = as.factor(secd_sub)) %>%
  nest(data = -secd_sub) %>%
  mutate(fit = map(data, ~ lm(ces_d ~ age, data = .x)),
         tidied = map(fit, tidy, conf.int = TRUE)) %>%
  unnest(tidied) %>%
  select(-c(data, fit)) %>%
  filter(term %in% "age") %>%
  clean_names() %>%
  arrange(secd_sub) %>%
  drop_na()

# Relabel the numeric substance codes by position.
# NOTE(review): positional relabelling assumes the factor still carries all
# nine original levels after filtering — verify against the codebook.
levels(slope_table$secd_sub) <- c("None", "Alcohol", "Cocaine",
                                  "Heroine", "Barbituates", "Benzos",
                                  "Marijuana", "Methadone", "Opiates")

slope_table %>% gt()

| secd_sub | term | estimate | std_error | statistic | p_value | conf_low | conf_high |
|---|---|---|---|---|---|---|---|
| None | age | 0.13582038 | 0.1126537 | 1.2056446 | 0.22934428 | -0.08628813 | 0.35792889 |
| Alcohol | age | -0.28330979 | 0.1687487 | -1.6788858 | 0.09598731 | -0.61769658 | 0.05107701 |
| Cocaine | age | 0.06950926 | 0.1565577 | 0.4439850 | 0.65819510 | -0.24182287 | 0.38084139 |
| Heroine | age | -0.11506198 | 0.3496772 | -0.3290520 | 0.74613547 | -0.85281631 | 0.62269236 |
| Benzos | age | 0.05769718 | 0.3339358 | 0.1727793 | 0.86771459 | -0.73193555 | 0.84732992 |
| Marijuana | age | -0.28415183 | 0.3220931 | -0.8822041 | 0.38492071 | -0.94290610 | 0.37460244 |
| Opiates | age | -0.91494845 | 0.8883456 | -1.0299465 | 0.49060905 | -12.20245011 | 10.37255320 |
(a) Read the dataset from CSVs hosted on GitHub using read_csv().(Hint: you want to skip the first two lines with the skip option and read up to the 52nd state with the n_max option.)
link <- "https://raw.githubusercontent.com/opencasestudies/ocs-healthexpenditure/master/data/KFF/healthcare-coverage.csv"

# Read the coverage table with readr::read_csv() as the problem requests:
# skip the two header lines, keep only the 52 "state" rows (50 states + DC
# + US total), and treat "N/A" as missing.
data_csv <-
  read_csv(file = link, skip = 2, n_max = 52, na = "N/A") %>%
  clean_names()

# clean_names() prefixes names that start with a digit with "x"; strip it.
colnames(data_csv) <- sub("^x", "", colnames(data_csv))

(b) Convert all year-based columns to integer using mutate(across(…)) . (Hint: Read Section 7.2 in the textbook for the across() function.)
# Coerce every "201x"-prefixed year column to integer in one pass.
data_csv <- mutate(data_csv, across(starts_with("201"), as.integer))

(c) Further tidy the dataset and convert it to a long data format as shown below.
# Pivot to long format — one row per (location, year, coverage type) — then
# split "2013_employer"-style column names into year and type.
data_csv <-
  data_csv %>%
  pivot_longer(-location, names_to = "year",
               values_to = "tot_coverage") %>%
  separate(year, c("year", "type"), extra = "merge") %>%
  mutate(year = factor(year),
         # The original `type = factor(type)` was dead code: the very next
         # assignment, str_to_sentence(), returns character and overwrote
         # it. str_replace() swaps the first "_" for "-" (e.g. "Non-group"),
         # replacing the out-of-pipe base sub() call.
         type = str_replace(str_to_sentence(type), "_", "-"))

head(data_csv, 10) %>%
  gt()

| location | year | type | tot_coverage |
|---|---|---|---|
| United States | 2013 | Employer | 155696900 |
| United States | 2013 | Non-group | 13816000 |
| United States | 2013 | Medicaid | 54919100 |
| United States | 2013 | Medicare | 40876300 |
| United States | 2013 | Other-public | 6295400 |
| United States | 2013 | Uninsured | 41795100 |
| United States | 2013 | Total | 313401200 |
| United States | 2014 | Employer | 154347500 |
| United States | 2014 | Non-group | 19313000 |
| United States | 2014 | Medicaid | 61650400 |
The Violations data set in the mdsr package contains information regarding the outcome of health inspections of restaurants in New York City. The ViolationCodes data set includes violation description and classification of critical violations (violations that are most likely to contribute to foodborne illness). See original data source for more information: NYC Open Data. Use these data to calculate, by zip code, the average number of inspections per restaurant and the rate of critical violations. What pattern do you see between the average number of inspections per restaurant and the rate of critical violations? (Hint: Note that an inspection can appear across several rows in the Violations data set if multiple violations were associated with the inspection. The rate of critical violations should be defined as the rate of inspections that result in at least one critical violation.)
# Load data
data("ViolationCodes")
data("Violations")

# Join violation descriptions onto inspection records and drop the columns
# that are not needed for the per-zip-code summary.
data_nyc_restaurants <-
  inner_join(ViolationCodes, Violations, by = "violation_code") %>%
  select(-c(violation_description, cuisine_code, phone, action,
            street, grade_date, boro, record_date,
            grade, inspection_type, violation_code, building)) %>%
  drop_na() %>%
  mutate(across(where(is.character), as.factor),
         camis = factor(camis)) %>%
  select(zipcode, camis, dba, inspection_date, everything()) %>%
  arrange(zipcode, camis, inspection_date)

# One row per inspection (restaurant x date) that had at least one critical
# violation, with its critical-flag count and mean score.
# NOTE(review): filtering before summarising drops inspections with zero
# critical violations entirely — confirm that matches the intended "rate of
# inspections with at least one critical violation".
data_per_rest <-
  data_nyc_restaurants %>%
  drop_na() %>%
  group_by(zipcode, camis, inspection_date) %>%
  # Logical comparison replaces the redundant ifelse(cond, TRUE, FALSE).
  mutate(n_critical_flags = critical_flag == "Critical") %>%
  filter(n_critical_flags > 0) %>%
  summarise(number_of_critical_flags = sum(n_critical_flags == 'TRUE'),
            score = mean(score)) %>%
  arrange(zipcode, camis)

# Per restaurant: number of (critical) inspections, mean score, and total
# critical flags.
data_number_visits_per_rest <-
  data_per_rest %>%
  group_by(zipcode, camis) %>%
  summarise(number_visits = n(),
            mean_score = mean(score),
            total_number_of_critical_flags = sum(number_of_critical_flags)) %>%
  arrange(zipcode, camis) %>%
  select(zipcode, camis, total_number_of_critical_flags, everything())

# Aggregate restaurant-level summaries up to zip-code level.
data_agregated_by_zipcode <-
  data_number_visits_per_rest %>%
  group_by(zipcode) %>%
  summarise(mean_inspector_visits = mean(number_visits),
            mean_inspector_score = mean(mean_score),
            mean_critical_violations = mean(total_number_of_critical_flags)) %>%
  arrange(zipcode)

data_agregated_by_zipcode %>%
  head(10)

# A tibble: 10 × 4
zipcode mean_inspector_visits mean_inspector_score mean_critical_violations
<int> <dbl> <dbl> <dbl>
1 10001 5.43 14.5 10.3
2 10002 5.59 16.1 11.5
3 10003 5.77 15.4 11.5
4 10004 5.29 14.0 9.44
5 10005 5.59 15.2 10.9
6 10006 6.53 15.5 13.0
7 10007 5.44 14.9 10.4
8 10009 5.95 15.7 11.5
9 10010 5.54 15.7 10.9
10 10011 5.71 15.0 11.1
What pattern do you see between the average number of inspections per restaurant and the rate of critical violations? (Hint: Note that an inspection can appear across several rows in the Violations data set if multiple violations were associated with the inspection)
# Scatter of mean inspections per restaurant vs mean critical violations by
# zip code, with a linear fit.
ggplot(data_agregated_by_zipcode,
       aes(x = mean_inspector_visits, y = mean_critical_violations)) +
  geom_point() +
  geom_smooth(method = lm) +
  labs(x = "Inspector Visits", y = "Critical Violations") +
  theme(axis.text.x = element_text(size = 14),
        axis.text.y = element_text(size = 14),
        axis.title.x = element_text(size = 14),
        axis.title.y = element_text(size = 14),
        # Black border around the plot panel
        panel.border = element_rect(colour = "black",
                                    fill = NA, size = 1.3))

# Get lat long for each zipcode
# Look up centroid latitude/longitude for each zip code.
zip_long_lat <-
  geocode_zip(data_agregated_by_zipcode$zipcode) %>%
  mutate(zipcode = as.integer(zipcode))

# Join the aggregated data with the lat/long lookup.
nyc <-
  inner_join(data_agregated_by_zipcode, zip_long_lat, by = "zipcode") %>%
  select(zipcode, lat, lng, everything()) %>%
  drop_na()

# Get map and set palette
nyc_map <- get_map(getbb("New York"), maptype = "toner-background")
my_palette <- colorRampPalette(rev(brewer.pal(11, "Spectral")))

# Plot critical-violation intensity per zip code over the NYC base map.
ggmap(nyc_map) +
  geom_point(data = nyc,
             aes(x = lng, y = lat, color = mean_critical_violations),
             size = 9) +
  scale_colour_gradientn(colours = my_palette(10), limits = c(1, 22)) +
  labs(color = 'Critical \n Violations') +
  theme(legend.position = "bottom",
        legend.key.size = unit(2, 'cm'),
        legend.title = element_text(size = 30),
        legend.text = element_text(size = 30))